#!/usr/bin/env perl

use strict;
use warnings;
use Getopt::Long qw(:config no_ignore_case bundling);
use FindBin;
use lib "$FindBin::RealBin";
use UnionFind;
use threads;
use Thread::Queue;
use IO::Handle;

my ($no_header, $compact, $dot_file, $length_unitigs_file, $unitigs_k, $bases, $DEBUG, $force, $help);
my $nb_threads   = 1;
my $min_density  = 0.05;
my $nb_errors    = 3;
my $overlap_play = 0.3;
my $output       = "mega_reads";
GetOptions("d|density=f"        => \$min_density,
           "H|no-header"        => \$no_header,
           "t|thread=i"         => \$nb_threads,
           "o|output=s"         => \$output,
           "c|compact"          => \$compact,
           "e|errors=f"         => \$nb_errors,
           "l|length-unitigs=s" => \$length_unitigs_file,
           "b|bases"            => \$bases,
           "k=i"                => \$unitigs_k,
           "p|overlap-play=f"   => \$overlap_play,
           "dot=s"              => \$dot_file,
           "force"              => \$force,
           "debug"              => \$DEBUG,
           "h|help"             => \$help) or die;

if(defined($help)) {
  print(<<EOS);
Find the longest (with most k-mers) path in the super-read overlap graph

 -d,--density FLOAT        Minimum density of k-mers ($min_density)
 -H,--no-header            Do not print line headers
 -t,--thread INT           Number of threads ($nb_threads)
 -o,--output PATH          Output prefix file ($output)
 -c,--compact              Compact output
 -e,--errors FLOAT         Number of average errors slack for implied position ($nb_errors)
 -l,--length-unitigs FILE  File containing unitigs length
 -b,--bases                Use bases counts instead of k-mer counts
 -k                        Length of k-mer for creating k-unitigs in super reads
 -p,--overlap-play FLOAT   Play in the overlap length difference between position and k-unitigs lengths ($overlap_play)
    --dot PREFIX           Write an overlap graph (dot file) for each PacBio read
 -h,--help                 This message
EOS
# ;
exit(0);
}
if(!defined($force)) {
  print(STDERR "longest_path_overlap_graph is deprecated. Use longest_path_overlap_graph2 in the src_jf_aligner directory\n",
        "Or use '--force' to ignore this warning\n");
  exit(1);
}

if(defined($length_unitigs_file) != defined($unitigs_k)) {
  print(STDERR "Switches --length-unitigs and -k must be supplied together.");
  exit(1);
}

sub round { int($_[0] + $_[0]/abs($_[0] * 2)); }

my $dot_io;
if(defined($dot_file)) {
  open($dot_io, ">", $dot_file) or die "Failed to open dot file '$dot_file': $!";
}


my @unitig_len;
if(defined($length_unitigs_file)) {
  open(my $ul_io, "<", $length_unitigs_file) or
      die "Can't open file '$length_unitigs_file': $!";
  while(<$ul_io>) {
    my ($index, $len) = split;
    push(@unitig_len, $len);
  }
  close($ul_io);
}

sub length_sr {
  my ($sr)    = @_;
  my @unitigs = split(/_/, $sr);
  my $len     = 0;
  $len += $unitig_len[substr($_, 0, -1)] foreach (@unitigs);
  $len - (@unitigs - 1) * ($unitigs_k - 1);
}

# Start threads
my $input_queue = Thread::Queue->new();
for(my $i = 0; $i < $nb_threads; $i++) { threads->create(\&thread_main); }

# Read input for worker threads
while(<>) {
  next unless /^>(\d+) (\S+)/;
  my ($nb_lines, $pb) = ($1, $2);
  my @lines :shared;
  for(my $i = 0; $i < $nb_lines; $i++) {
    my $line :shared = <>;
    push(@lines, \$line);
  }
  push(@lines, $pb);
  $input_queue->enqueue(\@lines);
}

# Terminate. Because '$input_queue->end()' does not exists in old
# versions of Perl, we send a sentinel value (an empty array)
foreach my $th (threads->list) { my @sentinel :shared; $input_queue->enqueue(\@sentinel); }
foreach my $th (threads->list) { $th->join; }


sub thread_main {
  my $output_file = $output . "_" . threads->tid();
  open(my $out, ">", $output_file) or die "Can't open output file '$output_file': $!";

  unless($no_header) {
    print($out "Rstart Rend Nmers Density");
    print($out " PB") unless $compact;
    print($out " MegaRead\n") unless $no_header;
  }
  while(my $lines = $input_queue->dequeue) {
    last unless @$lines;
    my $pbName = pop(@$lines);
    my @nodes;
    foreach my $line (@$lines) {
      my @flds = split (/\s/, $$line);
      my @info = splice(@flds, 0, 15);
      my @counts = map { my $x = /(\d+):(\d+)/; defined($bases) ? $2 : $1 } @flds;
      my $nmers = defined($bases) ? $info[8] : $info[4];
      my ($stretch, $offset, $avg_err) = @info[11..13];
      # LPATH -> Number of mers in longest path up to this node. LPREV ->
      # Previous node in longest path. LSTART -> starting node in longest
      # path. LCOUNT -> Number of nodes in longest path.
      my $node = { NAME => $info[14], NMERS => $nmers, NEIGHS => [], INDEG => 0,
                   RSTART => $info[0], REND => $info[1], QSTART => $info[2], QEND => $info[3],
                   RLEN => $info[9], QLEN => $info[10], MERCOUNTS => \@counts,
                   STRETCH => $stretch, OFFSET => $offset, AVG_ERR => $avg_err,
                   LPATH => 0, LPREV => undef, LSTART => undef };
      $$node{IMP_START} = $stretch + $offset;
      $$node{IMP_END}   = $stretch * $$node{QLEN} + $offset;
      #$$node{IMP_START} = $info[0]-$info[2]+1;
      #$$node{IMP_END}   = $info[1]+$info[10]-$info[3];

      push(@nodes, $node);
    }
    @nodes = sort { $$a{IMP_START} <=> $$b{IMP_START} } @nodes;
    for(my $i = 0; $i < @nodes; $i++) {
      $nodes[$i]{ID} = $i;
    }
    handle_graph(\@nodes, $pbName, $out);
  }
  close($out);
}

# Create the edges between the nodes. Returns the connected
# components.
sub create_edges {
  my ($nodes) = @_;

# An edge is a triplet [v, w, z] where v is the index of
# the id of the node and w is the weight (number of shared mers
# between nodes), and z is the number of shared k-unitigs.
  my $components = UnionFind->new();
  for(my $i       = 0; $i < @$nodes; ++$i) {
    my $node_i    = $$nodes[$i];
    next if $$node_i{IMP_END} >= $$node_i{RLEN}; # Hanging off 3' end of PacBio read: no overlap
    my $i_name    = $$node_i{NAME};
    $i_name =~ /(^|_)(\d+(F|R))$/;
    my $lastKUni  = $2;
    print "NAME i $i $i_name imp_s $$node_i{IMP_START} imp_e $$node_i{IMP_END}\n" if $DEBUG;
    for (my $j=$i+1; $j < @$nodes; ++$j) {
      my $node_j      = $$nodes[$j];
      my $position_len = $$node_i{IMP_END} - $$node_j{IMP_START};
      print "  name j $j $$node_j{NAME} implied start j $$node_j{IMP_START}\n" if $DEBUG;
      next if $$node_j{IMP_START} <= 1; # Hanging off 5' end of PacBio read: no overlap
      if($position_len * (1 + $overlap_play) < $unitigs_k) { # the maximum implied overlap is less than k-mer length 
        print "    small overlap\n" if $DEBUG;
        last;
      }
      my $j_name = $$node_j{NAME};
      next unless $j_name =~ /(?:^|_)($lastKUni)(?:_|$)/; # last unitig of i not in j -> no overlap
      next if $j_name eq $i_name; # refuse overlap with myself
      my $common_len     = $+[1];
      my $common_unitigs = substr($i_name, -$common_len);
      next unless $common_unitigs eq substr($j_name, 0, $common_len);
      print "  OVERLAP FOUND\n" if $DEBUG;
      $common_unitigs =~ s/(F|R)//g;
      my @unitigs     = split(/_/, $common_unitigs); # k-unitigs in common between nodes
      my $mers_shared = 0;
      my $mers_info   = $$node_j{MERCOUNTS};
      my $overlap_len = @unitig_len ? 0 : undef;
      print("  ovlap") if $DEBUG;
      for(my $k = 0; $k < @unitigs; ++$k) {
        $mers_shared += $$mers_info[2 * $k];
        $mers_shared -= $$mers_info[2 * $k - 1] if($k > 0);
        $overlap_len += $unitig_len[$unitigs[$k]] if defined($overlap_len);
        print(" $unitigs[$k]") if $DEBUG;
      }
      print("\n") if $DEBUG;
      
      if(defined($overlap_len)) {
        $overlap_len -= (@unitigs - 1) * ($unitigs_k - 1);
        my $err = $nb_errors * ($$node_j{AVG_ERR} + $$node_i{AVG_ERR});
        print("  OVERLAP LENGTH $overlap_len vs $position_len error play $overlap_play err $err\n") if $DEBUG;
        print("  ", (1 + $overlap_play) * $position_len + $err, " ", (1 + $overlap_play) * $overlap_len + $err, "\n") if $DEBUG;
        print("  ", (($position_len > (1 + $overlap_play) * $overlap_len  + $err)) ? "next" : "keep", "\n") if $DEBUG;
#        $overlap_len *= ($$node_i{STRETCH} + $$node_j{STRETCH}) / 2;
#        my $position_len = $$node_i{IMP_END} - $$node_j{IMP_START} + 1;
        if(($overlap_len  > (1 + $overlap_play) * $position_len + $err) ||
           ($position_len > (1 + $overlap_play) * $overlap_len  + $err)) {
          next;
        }
      }
      print "  ADDING EDGE\n" if $DEBUG;
      push(@{$$node_i{NEIGHS}}, [$j, $mers_shared, scalar(@unitigs)]);
      $$node_j{INDEG}++;
      $components->union($i, $j);
    }
  }

  return $components;
}

sub write_dot {
  my ($nodes, $pbName, $io) = @_;

  print($io "digraph \"$pbName\" {\n");
  print($io "  node [fontsize=10];\n");
  foreach my $n (@$nodes) {
    my $start_n   = $$n{LSTART};
    my $lp_len    = $$n{IMP_END} - $$start_n{IMP_START} + 1;
    my $density   = $$n{LPATH} / $lp_len;
    my $color     = "";
    if($$n{INDEG} == 0) {
      $color = "color=blue";
    } elsif(!@{$$n{NEIGHS}}) {
      $color = "color=green";
    }
    printf($io "  $$n{ID} [tooltip=\"$$n{NAME}\", label=\"$$n{ID} L$$n{QLEN} #$$n{NMERS}\\nP($$n{RSTART},$$n{REND}) S($$n{QSTART},$$n{QEND})\\nI($$n{IMP_START},$$n{IMP_END})\\nLP #$$n{LPATH} L%.1f d%.2g\"$color];\n", $lp_len, $density);
  }

  foreach my $n (@$nodes) {
    my @unitigs = split(/_/, $$n{NAME});
    foreach my $e (@{$$n{NEIGHS}}) {
      my $color = "";
      $color = ", color=\"red\"" if $$n{ON_LPATH} && $$nodes[$$e[0]]{ON_LPATH};
      my $common = join("_", @unitigs[-$$e[2]..-1]);
      print($io "  $$n{ID} -> $$e[0] [tooltip=\"$common\", label=\"$$e[1]\"$color];\n");
    }
  }
  print($io "}\n");
}

# Find the longest paths in the graph, for the set of nodes and
# connected components. Returns a hash with for each connected
# component, a reference the the last node of the longest path.
sub find_longest_paths {
  my ($nodes, $components) = @_;
# One longest path per weakly connected component. Reference to
# terminal node of path.
  my %longest_paths;

# The nodes have been created in topological order, because the
# coordinates are sorted according to PacBIO read position. So we can
# find the longest path by visiting each node in order once.
  foreach my $v (@$nodes) {
    # Bootstrap for starting nodes
    if($$v{INDEG} == 0) {
      $$v{LPATH}  = $$v{NMERS};
      $$v{LSTART} = $v;
    }
    my $cpath = $$v{LPATH};

    # Record longest path for ending nodes
    if(@{$$v{NEIGHS}} == 0) {
      # If both start and end node, single node in component, don't
      # record a longest path
      #next if $$v{INDEG} == 0;
      my $comp = $components->find($$v{ID});
      my $lp   = $longest_paths{$comp};
      if(!defined($lp) || $cpath > $$lp[0]{LPATH}) {
        $longest_paths{$comp} = [$v];
      } elsif(defined($lp) && $cpath == $$lp[0]{LPATH}) {
        push(@$lp, $v);
      }
      next;
    }
    
    # For interior nodes, updates neighbors
    foreach my $e (@{$$v{NEIGHS}}) {
      my $nv     = $$nodes[$$e[0]];
      my $ncp    = $cpath + $$nv{NMERS} - $$e[1];
      if(!defined($$nv{LSTART}) ||
         $ncp > $$nv{LPATH} || 
         ($ncp == $$nv{LPATH} && $$v{LSTART}{IMP_START} > $$nv{LSTART}{IMP_START})) {
        $$nv{LPATH}  = $ncp;
        $$nv{LSTART} = $$v{LSTART};
        $$nv{LPREV}  = $v;
      }
    }
  }
  return %longest_paths
}

# Print the longest paths
sub print_longest_paths {
  my ($longest_paths, $pbname, $out) = @_;

  print($out ">$pbname\n") if $compact;
# For each connected component, get the longest paths (given by their last node $end_n).
  while(my ($comp, $end_nodes) = each %$longest_paths) {
    my ($max_end_n, $max_density) = (undef, 0);
    foreach my $end_n (@$end_nodes) {
      my $start_n = $$end_n{LSTART};
      my $density;
      my $len = ($$end_n{IMP_END} > $$end_n{RLEN} ? $$end_n{RLEN} : $$end_n{IMP_END}) - ($$start_n{IMP_START} < 1 ? 1 : $$start_n{IMP_START});
      $density = $len > 0 ? $$end_n{LPATH} / $len : 0;
 
      if($density > $max_density) {
        $max_density = $density;
        $max_end_n   = $end_n;
      }
    }
    next if($max_density < $min_density);

    my @path;
    my $cur_n   = $max_end_n;
    my $start_n = $$max_end_n{LSTART};
    while(defined($cur_n)) {
      $$cur_n{ON_LPATH} = 1;
      push(@path, $cur_n);
      $cur_n = $$cur_n{LPREV};
    }

    my $path_len = length_sr($path[-1]{NAME});
    my $mega_read = $path[-1]{NAME};
    for(my $i = $#path; $i > 0; $i--) {
      my $v = $path[$i];
      foreach my $n (@{$$v{NEIGHS}}) {
        next unless $$n[0] == $path[$i - 1]{ID};
        my $w = $path[$i - 1];
        my @srs = split(/_/, $$w{NAME});
        splice(@srs, 0, $$n[2]);
        last unless @srs;
        $path_len += length_sr(join("_", @srs)) - ($unitigs_k - 1);
        $mega_read .= "_" . join("_", @srs);
        last;
      }
    }
    printf($out "%.2f %.2f %d %d %d %d %d %.4f ",
           $$start_n{IMP_START}, $$max_end_n{IMP_END},
           $$start_n{RSTART}, $$max_end_n{REND},
           $$start_n{QSTART}, $path_len - ($$max_end_n{QLEN} - $$max_end_n{QEND}),
           $$max_end_n{LPATH}, $max_density);
    print($out "$pbname ") unless($compact);
    print($out " $mega_read $path_len\n");
  }
}

sub handle_graph {
  my ($nodes, $pbname, $out) = @_;
  return unless @$nodes;
  my $components = create_edges($nodes);
  my %longest_paths = find_longest_paths($nodes, $components);
  print_longest_paths(\%longest_paths, $pbname, $out);
  write_dot($nodes, $pbname, $dot_io) if(defined($dot_io));
}
